{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import buckaroo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def bimodal(mean_1, mean_2, N, na=0, sigma=5, sigma2=5):\n",
    "    X1 = np.random.normal(mean_1, sigma, int(N/2))\n",
    "    X2 = np.random.normal(mean_2, sigma2, int(N/2))\n",
    "    X = np.concatenate([X1, X2])\n",
    "    if na > 0:\n",
    "        na_count = int(np.round(N * na))\n",
    "        raw_vals = X.tolist()[:-na_count]\n",
    "        raw_vals.extend([pd.NA] * na_count)\n",
    "        return pd.Series(raw_vals, dtype='Float32')\n",
    "    else:\n",
    "        return X\n",
    "def rand_cat(named_p, na_per, N):\n",
    "    choices, p = [], []\n",
    "    named_total_per = sum(named_p.values()) + na_per\n",
    "    total_len = int(np.floor(named_total_per * N))\n",
    "    if named_total_per > 0:\n",
    "        for k, v in named_p.items():\n",
    "            choices.append(k)\n",
    "            p.append(v/named_total_per)\n",
    "\n",
    "        choices.append(pd.NA)\n",
    "        p.append(na_per/named_total_per)    \n",
    "        return [np.random.choice(choices, p=p) for k in range(total_len)]\n",
    "    else:\n",
    "        return []\n",
    "\n",
    "def random_categorical(named_p, unique_per, na_per, longtail_per, N):\n",
    "    choice_arr = rand_cat(named_p, na_per, N)\n",
    "    discrete_choice_len = len(choice_arr)\n",
    "\n",
    "    longtail_count = int(np.floor(longtail_per * N))//2\n",
    "    extra_arr = []\n",
    "    for i in range(longtail_count):\n",
    "        extra_arr.append(\"long_%d\" % i)\n",
    "        extra_arr.append(\"long_%d\" % i)\n",
    "\n",
    "    unique_len = N - (len(extra_arr) + discrete_choice_len)\n",
    "    #print(\"discrete_choice_len\", discrete_choice_len, \"longtail_count\", longtail_count, \"unique_len\", unique_len)\n",
    "    for i in range(unique_len):\n",
    "        extra_arr.append(\"unique_%d\" % i)\n",
    "    all_arr = np.concatenate([choice_arr, extra_arr])\n",
    "    np.random.shuffle(all_arr)\n",
    "    return all_arr        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "180f8f66-d39b-4cdb-bd7e-3c3af9cee939",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "pd.DataFrame({'bimodal_with_nan': bimodal(20,40,200, .2)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "N = 4000\n",
    "pd.DataFrame({\n",
    "    'all_NA' :          [pd.NA] * N,\n",
    "    'half_NA' :         random_categorical({1: .5}, unique_per=0,   na_per=.5, longtail_per=.0, N=N),\n",
    "    'longtail' :        random_categorical({},      unique_per=0,   na_per=.3, longtail_per=.7, N=N),\n",
    "    'longtail_unique' : random_categorical({},      unique_per=0.5, na_per=.0, longtail_per=.5, N=N),\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "#kitchen sink\n",
    "pd.DataFrame({\n",
    "    'one': [1]*N, \n",
    "    'all_NA': [pd.NA] * N,\n",
    "\n",
    "    'log_normal': np.random.lognormal(25, .3, N),\n",
    "    'bimodal': bimodal(20,40, N, na=.2),\n",
    "    'all_unique_N':[i for i in range(N)],\n",
    "\n",
    "    'cat_1':random_categorical({'foo': .35, 'bar': .2}, unique_per=.25, na_per=.2, longtail_per=0, N=N),\n",
    "    'all_unique_cat':[str(i)+ \"_a\" for i in range(N)],\n",
    "    'cat_long_tail': random_categorical({},      unique_per=0,   na_per=.2, longtail_per=.8, N=N),\n",
    "\n",
    "    'cat_3':random_categorical({'foo': .3, 'bar': .05, 'baz':.1}, unique_per=.1, na_per=.25, longtail_per=.2, N=N),\n",
    "    'cat_4':random_categorical({'foo': .3, 'bar': .05, 'baz':.1}, unique_per=.3, na_per=.25, longtail_per=0, N=N)\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "pd.DataFrame({\n",
    "    'bimodal' :  bimodal(20,40, N),\n",
    "    'exponential' :  np.random.exponential(1.0, N) * 10 ,\n",
    "    'geometric': np.random.geometric(.2, N) * 10,\n",
    "    'log_normal': np.random.lognormal(25, .3, N),\n",
    "    'normal': np.random.normal(25, .3, N),\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "#categoricals\n",
    "pd.DataFrame({\n",
    "    'two_vals':                random_categorical({1: .75, 2: .25}, unique_per=0, na_per=0, longtail_per=0, N=N),\n",
    "    'dominant_categories':     random_categorical({'foo': .6, 'bar': .25, 'baz':.15}, unique_per=0, na_per=0, longtail_per=0, N=N),\n",
    "    'dispersed_categories':    random_categorical({'foo': .3, 'bar': .05, 'baz':.1}, unique_per=.3, na_per=.25, longtail_per=0, N=N),\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb9f9eb6-5f09-44b4-8ed1-0ef54886cced",
   "metadata": {},
   "outputs": [],
   "source": [
    " "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
